Setup:
knitr::opts_chunk$set(echo = TRUE, max.print = 100)
Load libraries:
library(readxl)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(stringi)
library(Cairo)
library(ape)
library(geosphere)
library(Matrix)
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
library(ggrepel)
library(ggpubr)
##
## Attaching package: 'ggpubr'
##
## The following object is masked from 'package:ape':
##
## rotate
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
Load data:
Phonotacticon <- read_xlsx("Phonotacticon.xlsx", guess_max = 1661)
PhonoBib <- read_xlsx("PhonoBib.xlsx")
PanPhon <- read_csv("PanPhonPhonotacticon1_0.csv") %>%
filter(!duplicated(ipa))
## Rows: 7416 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ipa
## dbl (22): syl, son, cons, cont, delrel, lat, nas, strid, voi, sg, cg, ant, c...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Phonotacticon
PhonoBib
PanPhon
Create PanPhon regular expression:
PanPhonOrder <- PanPhon$ipa[order(-nchar(PanPhon$ipa), PanPhon$ipa)]
PanPhonRegex <- paste0("(?:", paste(PanPhonOrder, collapse="|"), ")")
str_trunc(PanPhonRegex, 100)
## [1] "(?:h͡d̪͡ɮ̪ʲʷ|h͡d̪͡ɮ̪ʷː|h͡d̪͡ɮ̪ʷˠ|h͡d̪͡ɮ̪ʷˤ|h͡d̪͡z̪ʲʷ|h͡d̪͡z̪ʷː|h͡d̪͡z̪ʷˠ|h͡d̪͡z̪ʷˤ|h͡t̪͡ɬ̪ʲʷ|h͡t̪..."
Subset Eurasian lects that are complete in Phonotacticon:
Eurasia <- Phonotacticon %>%
filter(Complete,
complete.cases(P),
O != "?",
N != "?",
C != "?",
!grepl("[A-Z]|\\[", O),
!grepl("[A-Z]|\\[", N),
!grepl("[A-Z]|\\[", C))
Extract onset, nucleus, and coda sequences:
Sequences <- Eurasia %>%
select(Lect, O, N, C) %>%
pivot_longer(-Lect, names_to = 'Category', values_to = 'Sequence') %>%
separate(col = Sequence,
sep = ' ',
into = as.character(1:500),
fill = 'right') %>%
pivot_longer(-c(Lect, Category), names_to = 'Number', values_to = 'Sequence') %>%
select(-Number) %>%
filter(!is.na(Sequence)) %>%
distinct()
Sequences
Split the sequences into segments:
Segments <- stri_extract_all_regex(Sequences$Sequence,
pattern = PanPhonRegex,
simplify = TRUE) %>%
as_tibble(.name_repair = 'unique') %>%
mutate(Lect = Sequences$Lect,
Category = Sequences$Category,
Sequence = Sequences$Sequence) %>%
pivot_longer(cols = -c(Lect, Category, Sequence),
names_to = 'Order',
values_to = 'ipa') %>%
mutate(Order = Order %>%
as.factor() %>%
as.integer()) %>%
filter(ipa != "")
## New names:
## • `` -> `...1`
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
Segments
Measure the length of each sequence:
Sequences_length <- Segments %>%
count(Lect, Category, Sequence,
name = 'Length')
Sequences_length
Join the length of each sequence to segments:
Segments <- left_join(Segments, Sequences_length)
## Joining, by = c("Lect", "Category", "Sequence")
Segments
Count the maximal length:
MaxLength <- max(Sequences_length$Length)
MaxLength
## [1] 5
Count the number of split segments:
Segments_number <- nrow(Segments)
Segments_number
## [1] 9214
Assign different positions to each sequence:
Sequences_rep <- bind_rows(rep(list(Segments), 5))
Sequences_rep <- Sequences_rep %>%
mutate(Position = rep(0:(MaxLength - 1),
each = Segments_number)) %>%
mutate(Order = Order + Position) %>%
filter(Length + Position <= MaxLength) %>%
select(-Length)
Sequences_rep
Join segments with their phonological features:
Sequences_features <- Sequences_rep %>%
left_join(PanPhon, by = 'ipa') %>%
pivot_longer(cols = -c(Lect,
Category,
Sequence,
Order,
ipa,
Position),
names_to = 'Feature',
values_to = 'Value') %>%
mutate(Feature = paste0(Feature, Order)) %>%
pivot_wider(names_from = 'Feature',
values_from = 'Value') %>%
select(-Order, -ipa) %>%
pivot_longer(cols = -c(Lect, Category, Sequence, Position),
names_to = 'Feature',
values_to = 'Value') %>%
filter(Value != 'NULL') %>%
pivot_wider(names_from = 'Feature',
values_from = 'Value') %>%
replace(is.na(.), 0)
Sequences_features
Paste sequence and position together:
Sequences_SeqPos <- Sequences_features %>%
mutate(SequencePosition = paste0(Sequence, Position)) %>%
select(-Lect, -Category, -Position, -Sequence) %>%
select(SequencePosition, everything()) %>%
distinct()
Sequences_SeqPos
Calculate the distance between segments:
Sequences_distance <- Sequences_SeqPos %>%
select(-SequencePosition) %>%
dist(method = 'euclidean') %>%
as.matrix() %>%
as_tibble(.name_repair = 'unique')
Sequences_distance <- bind_cols(Sequences_SeqPos$SequencePosition,
Sequences_distance)
## New names:
## • `` -> `...1`
colnames(Sequences_distance) <- c('SequencePosition',
Sequences_SeqPos$SequencePosition)
Sequences_distance